# Imports
import os
import pandas as pd
import csv
import kaggle

# other imports
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from copy import copy
import seaborn as sns
from scipy.stats import norm
import matplotlib.dates as mdates
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
from datetime import date, datetime
from dateutil.parser import parse
import us
# import operator 
# plt.style.use('fivethirtyeight')
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline 
c:\programdata\anaconda3\lib\site-packages\sklearn\utils\deprecation.py:144: FutureWarning: The sklearn.utils.testing module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.utils. Anything that cannot be imported from sklearn.utils is now part of the private API.
  warnings.warn(message, FutureWarning)

Covid Tracking Dataset (w/ hospitalised data)

Source: https://covidtracking.com/

Step 1: Load and Clean the Data

# Load the raw per-state daily data from the Covid Tracking Project API
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')

# Drop columns that are not needed for the analysis
unused_columns = ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']
all_cases = all_cases.drop(columns=unused_columns)

# TODO missing values
#      Do we get avg or missing values, or predict them?
#      See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72

# Parse the YYYYMMDD integer dates into datetime.date objects.
# Vectorised pd.to_datetime replaces the previous per-row iterrows loop
# and produces the same object-dtype column of datetime.date values.
all_cases['date'] = pd.to_datetime(all_cases['date'], format='%Y%m%d').dt.date

# Missing death figures mean no death reports yet, so treat them as 0
# (replaces the previous per-row np.isnan loop)
all_cases['death'] = all_cases['death'].fillna(0)

Missing values: Retrieving from other datasets or through merging columns (or both)

The following will be done:

  • Active Cases: Retrieved from JHU dataset and calculating $active = pos-dead-recovered$
  • Beds per State: Retrieved from External Datasets
# TODO Replace active cases with JHU and/or regression model (Selma)
# active = positive - recovered - death (NaN whenever any input is NaN)
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# relocate the freshly appended 'active' column to position 3
column_order = list(all_cases.columns)
column_order.remove('active')
column_order.insert(3, 'active')
all_cases = all_cases[column_order]
# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')

# Map full state names to their abbreviations in one vectorised pass,
# replacing the previous per-state loop of .loc assignments.
# Names absent from the mapping end up as NaN, exactly as before.
abbrev_by_state = dict(zip(state_abbrev['State'], state_abbrev['Abbreviation']))
us_population['Abbreviation'] = us_population['State'].map(abbrev_by_state)
hosp_beds['Abbreviation'] = hosp_beds['Location'].map(abbrev_by_state)

# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]

# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])

# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])

# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]
us_population.head()
State Abbreviation Pop
0 Alabama AL 4908621
1 Alaska AK 734002
2 Arizona AZ 7378494
3 Arkansas AR 3038999
4 California CA 39937489
# preview the beds-per-1000 table after the abbreviation column was added
hosp_beds.head()
Abbreviation Total
0 NaN 2.4
1 AL 3.1
2 AK 2.2
3 AZ 1.9
4 AR 3.2
# filter out non-existing states like 'AS'
valid_states = set(state_abbrev['Abbreviation'])
all_cases = all_cases[all_cases['state'].isin(valid_states)]
# see what filtered dataframe looks like
all_cases.head()
date state positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered dataQualityGrade ... totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade
0 2020-06-26 AK 836.0 303.0 12.0 NaN NaN 2.0 519.0 A ... 101792.0 NaN NaN NaN 0 0 0 0 0 NaN
1 2020-06-26 AL 34183.0 14410.0 658.0 2653.0 NaN NaN 18866.0 B ... NaN NaN NaN 33717.0 0 0 0 0 0 NaN
2 2020-06-26 AR 18062.0 5695.0 284.0 1300.0 NaN 61.0 12127.0 A ... NaN NaN NaN 18740.0 0 0 0 0 0 NaN
4 2020-06-26 AZ 66548.0 56624.0 2110.0 4514.0 581.0 312.0 8389.0 A+ ... 479330.0 NaN NaN 66055.0 0 0 0 0 0 NaN
5 2020-06-26 CA 200461.0 NaN 5639.0 NaN 1570.0 NaN NaN B ... 3771314.0 NaN NaN 200461.0 0 0 0 0 0 NaN

5 rows × 25 columns

# check which states have 0 positive cases
no_positive = all_cases['positive'] == 0
all_cases[no_positive]
date state positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered dataQualityGrade ... totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade
5710 2020-03-17 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5766 2020-03-16 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5817 2020-03-15 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5868 2020-03-14 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5883 2020-03-13 ID 0.0 NaN NaN NaN NaN NaN NaN NaN ... 131.0 NaN NaN NaN 0 0 0 0 0 NaN
5919 2020-03-13 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5921 2020-03-12 AK 0.0 NaN NaN NaN NaN NaN NaN NaN ... 46.0 NaN NaN NaN 0 0 0 0 0 NaN
5922 2020-03-12 AL 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5934 2020-03-12 ID 0.0 NaN NaN NaN NaN NaN NaN NaN ... 93.0 NaN NaN NaN 0 0 0 0 0 NaN
5970 2020-03-12 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5972 2020-03-11 AK 0.0 NaN NaN NaN NaN NaN NaN NaN ... 46.0 NaN NaN NaN 0 0 0 0 0 NaN
5973 2020-03-11 AL 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5974 2020-03-11 AR 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5985 2020-03-11 ID 0.0 NaN NaN NaN NaN NaN NaN NaN ... 67.0 NaN NaN NaN 0 0 0 0 0 NaN
5992 2020-03-11 ME 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5996 2020-03-11 MS 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5997 2020-03-11 MT 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
5999 2020-03-11 ND 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6020 2020-03-11 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6021 2020-03-11 WY 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6022 2020-03-10 AK 0.0 NaN NaN NaN NaN NaN NaN NaN ... 23.0 NaN NaN NaN 0 0 0 0 0 NaN
6023 2020-03-10 AL 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6024 2020-03-10 AR 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6030 2020-03-10 DE 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6035 2020-03-10 ID 0.0 NaN NaN NaN NaN NaN NaN NaN ... 41.0 NaN NaN NaN 0 0 0 0 0 NaN
6042 2020-03-10 ME 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6046 2020-03-10 MS 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6047 2020-03-10 MT 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6049 2020-03-10 ND 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6053 2020-03-10 NM 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6197 2020-03-07 MT 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6199 2020-03-07 ND 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6203 2020-03-07 NM 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6206 2020-03-07 OH 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 0.0 0 0 0 0 0 NaN
6212 2020-03-07 SD 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6216 2020-03-07 VA 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6220 2020-03-07 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6221 2020-03-07 WY 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6222 2020-03-06 AK 0.0 NaN NaN NaN NaN NaN NaN NaN ... 8.0 NaN NaN NaN 0 0 0 0 0 NaN
6223 2020-03-06 AR 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6227 2020-03-06 DC 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6228 2020-03-06 DE 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6231 2020-03-06 IA 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6234 2020-03-06 KS 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6235 2020-03-06 KY 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6240 2020-03-06 NE 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6243 2020-03-06 NM 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6246 2020-03-06 OH 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 0.0 0 0 0 0 0 NaN
6250 2020-03-06 SC 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6253 2020-03-06 VA 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6254 2020-03-06 VT 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6257 2020-03-06 WV 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6260 2020-03-05 CO 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6261 2020-03-05 DC 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6265 2020-03-05 MD 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6268 2020-03-05 NE 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6273 2020-03-05 OH 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN 0.0 0 0 0 0 0 NaN
6276 2020-03-05 SC 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6279 2020-03-05 VA 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN
6293 2020-03-04 SC 0.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0 0 0 0 0 NaN

105 rows × 25 columns

# Per-date and per-state views of the main dataset
df_split_by_date = {day: frame for day, frame in all_cases.groupby('date')}
df_split_by_state = {st: frame for st, frame in all_cases.groupby('state')}

# Attach each state's population to the daily records
df_merge_uspop = (
    all_cases
    .merge(us_population, how='left', left_on='state', right_on='Abbreviation')
    .drop(columns=['Abbreviation'])
    .rename(columns={'Pop': 'population'})
)

# move 'population' next to the state column
cols = list(df_merge_uspop)
cols.insert(2, cols.pop(cols.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, cols]

# Attach the beds-per-thousand figures
df_merge_hosp = (
    df_merge_uspop
    .merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
    .drop(columns=['Abbreviation'])
)
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})
all_cases.head()
date state population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade State bedsPerThousand
0 2020-06-26 AK 734002 836.0 303.0 12.0 NaN NaN 2.0 519.0 ... NaN NaN 0 0 0 0 0 NaN Alaska 2.2
1 2020-06-26 AL 4908621 34183.0 14410.0 658.0 2653.0 NaN NaN 18866.0 ... NaN 33717.0 0 0 0 0 0 NaN Alabama 3.1
2 2020-06-26 AR 3038999 18062.0 5695.0 284.0 1300.0 NaN 61.0 12127.0 ... NaN 18740.0 0 0 0 0 0 NaN Arkansas 3.2
3 2020-06-26 AZ 7378494 66548.0 56624.0 2110.0 4514.0 581.0 312.0 8389.0 ... NaN 66055.0 0 0 0 0 0 NaN Arizona 1.9
4 2020-06-26 CA 39937489 200461.0 NaN 5639.0 NaN 1570.0 NaN NaN ... NaN 200461.0 0 0 0 0 0 NaN California 1.8

5 rows × 28 columns

# Total beds per state = population / 1000 * beds-per-thousand
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']

# 'state' currently holds abbreviations and 'State' full names; swap the labels
all_cases = all_cases.rename(columns={'state': 'abbrev', 'State': 'state'})

# move the full state name next to the date
cols = list(all_cases)
cols.remove('state')
cols.insert(1, 'state')
all_cases = all_cases[cols]
all_cases.head()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
0 2020-06-26 Alaska AK 734002 836.0 303.0 12.0 NaN NaN 2.0 ... NaN NaN 0 0 0 0 0 NaN 2.2 1614.8044
1 2020-06-26 Alabama AL 4908621 34183.0 14410.0 658.0 2653.0 NaN NaN ... NaN 33717.0 0 0 0 0 0 NaN 3.1 15216.7251
2 2020-06-26 Arkansas AR 3038999 18062.0 5695.0 284.0 1300.0 NaN 61.0 ... NaN 18740.0 0 0 0 0 0 NaN 3.2 9724.7968
3 2020-06-26 Arizona AZ 7378494 66548.0 56624.0 2110.0 4514.0 581.0 312.0 ... NaN 66055.0 0 0 0 0 0 NaN 1.9 14019.1386
4 2020-06-26 California CA 39937489 200461.0 NaN 5639.0 NaN 1570.0 NaN ... NaN 200461.0 0 0 0 0 0 NaN 1.8 71887.4802

5 rows × 29 columns

  • Load and clean JHU data
  • Merge JHU dataset with main dataset
# This cell takes some time, as it needs to connect to Kaggle Servers to retrieve data
kaggle.api.authenticate()
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)
# Read every JHU daily-report CSV into one DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'

df_list = [
    pd.read_csv(os.path.join(dirname, filename))
    for dirname, _, filenames in os.walk(dir_jhu)
    for filename in filenames
    if 'gitignore' not in filename and 'README' not in filename
]

jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)

# Normalise the two last-update column variants to plain dates
# (older JHU files use 'Last Update', newer ones 'Last_Update')
jhu_df.loc[:, 'Last Update'] = pd.to_datetime(jhu_df['Last Update']).apply(lambda x: x.date())
jhu_df.loc[:, 'Last_Update'] = pd.to_datetime(jhu_df['Last_Update']).apply(lambda x: x.date())

# Combine Last Update with Last_Update
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])

# Combine Country/Region with Country_Region
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])

# Retrieve only US data. The explicit .copy() makes this an independent
# DataFrame so the column assignment below does not write into a view of
# the original (avoids pandas' SettingWithCopyWarning / silent no-op).
jhu_df = jhu_df[jhu_df['CountryRegion'] == 'US'].copy()

# Combine Province/State with Province_State
jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])

# Drop unnecessary columns
jhu_df = jhu_df.drop(['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude', 'Combined_Key', 'Country/Region',
                      'Country_Region', 'Province/State', 'Province_State',
                      'Last Update', 'Last_Update', 'FIPS'], axis=1)

# Change column order
cols = list(jhu_df)
cols.insert(0, cols.pop(cols.index('CountryRegion')))
cols.insert(1, cols.pop(cols.index('ProvinceState')))
cols.insert(2, cols.pop(cols.index('LastUpdate')))
jhu_df = jhu_df.loc[:, cols]

# Abbreviation -> full state name lookup (e.g. 'WA' -> 'Washington')
state_abbrs_dict = {state.abbr: state.name for state in us.states.STATES}

def toState(input_state, mapping):
    """Resolve a location string to a full state name.

    Takes the last two non-whitespace characters of *input_state* as a
    candidate abbreviation (e.g. 'King County, WA' -> 'WA') and looks it
    up in *mapping*. If the abbreviation is unknown, the original string
    is returned unchanged.
    """
    abbreviation = input_state.rstrip()[-2:]
    return mapping.get(abbreviation, input_state)

# Normalise every ProvinceState entry to a full US state name.
# 'Washington, D.C.' is special-cased because its trailing two characters
# ('C.') would not resolve through the abbreviation map.
def _normalise_state(value):
    if value == 'Washington, D.C.':
        return 'District of Columbia'
    return toState(value, state_abbrs_dict)

jhu_df['ProvinceState'] = jhu_df['ProvinceState'].apply(_normalise_state)

# Filter out unknown states
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(all_cases.state.unique().tolist())]

# Merge-sum rows with same date and State.
# The built-in sum is kept deliberately: unlike the string 'sum'
# aggregation it propagates NaN instead of skipping it.
jhu_df = jhu_df.groupby(['LastUpdate', 'ProvinceState']).agg(
    {
        'Active': sum,
        'Confirmed': sum,
        'Deaths': sum,
        'Recovered': sum
    }
).reset_index()

jhu_df.tail()
LastUpdate ProvinceState Active Confirmed Deaths Recovered
5190 2020-06-19 Virginia 54652.0 56238.0 1586.0 0.0
5191 2020-06-19 Washington 25947.0 27192.0 1245.0 0.0
5192 2020-06-19 West Virginia 2330.0 2418.0 88.0 0.0
5193 2020-06-19 Wisconsin 23157.0 23876.0 719.0 0.0
5194 2020-06-19 Wyoming 1126.0 1144.0 18.0 0.0
# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset

# For each daily state record, pull the matching JHU 'Active' figure,
# zero-fill missing recoveries, and recompute 'active' where it is
# missing or zero.
# NOTE(review): `row` is a snapshot taken by iterrows() before the .at
# writes in the same iteration. In particular `np.isnan(row['active'])`
# below sees the ORIGINAL value, so when the source 'active' was NaN the
# JHU value assigned above is immediately overwritten by the
# positive-recovered-dead fallback — confirm this is intended.
for i, row in all_cases.iterrows():
    last_update = all_cases.at[i, 'date']
    state = all_cases.at[i, 'state']
    # JHU rows for this state on this date (expected 0 or 1 after groupby)
    matching_row = jhu_df[jhu_df['ProvinceState'] == state]
    matching_row = matching_row[matching_row['LastUpdate'] == last_update].reset_index()

    if len(matching_row.values) > 0:
        #all_cases.at[i, 'positive'] = matching_row['Confirmed'].values[0]
        all_cases.at[i, 'active'] = matching_row['Active'].values[0]
        #all_cases.at[i, 'recovered'] = matching_row['Recovered'].values[0]   --- JHU was inconsistent, therefore removed
        #all_cases.at[i, 'death'] = matching_row['Deaths'].values[0]

    # Replace unknown recovery numbers with 0
    if np.isnan(row['recovered']):
        all_cases.at[i, 'recovered'] = 0

    # Fallback: derive active cases from the other counters
    if all_cases.at[i, 'active'] == 0 or np.isnan(row['active']):
        positive = all_cases.at[i, 'positive']
        recovered = all_cases.at[i, 'recovered']
        dead = all_cases.at[i, 'death']
        all_cases.at[i, 'active'] = positive - recovered - dead

all_cases.tail()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
5825 2020-01-26 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
5826 2020-01-25 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
5827 2020-01-24 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
5828 2020-01-23 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615
5829 2020-01-22 Washington WA 7797095 2.0 2.0 NaN NaN NaN NaN ... NaN NaN 0 0 0 0 0 NaN 1.7 13255.0615

5 rows × 29 columns

# Persist the formatted dataset locally as a fallback copy
backup_path = 'results/all_cases.csv'
all_cases.to_csv(backup_path)
# Normalise the date column to pandas datetime
all_cases['date'] = pd.to_datetime(all_cases['date'])

An exploratory data analysis of the US dataset

1. Basic triad of the dataset: investigating data types and number of rows per column

# Reload the saved dataset so the analysis can start from the CSV
dataset_file = 'results/all_cases.csv'
covid_df = pd.read_csv(dataset_file, index_col=0)
# restore datetime dtype lost in the CSV round-trip
covid_df['date'] = pd.to_datetime(covid_df['date'])
covid_df.info()
# display floats with 3 decimals
pd.set_option('display.float_format', '{:.3f}'.format)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5830 entries, 0 to 5829
Data columns (total 29 columns):
date                      5830 non-null datetime64[ns]
state                     5830 non-null object
abbrev                    5830 non-null object
population                5830 non-null int64
positive                  5830 non-null float64
active                    5830 non-null float64
hospitalizedCurrently     3551 non-null float64
hospitalizedCumulative    3162 non-null float64
inIcuCurrently            1833 non-null float64
onVentilatorCurrently     1630 non-null float64
recovered                 5830 non-null float64
dataQualityGrade          4896 non-null object
lastUpdateEt              5475 non-null object
dateModified              5475 non-null object
checkTimeEt               5475 non-null object
death                     5830 non-null float64
hospitalized              3162 non-null float64
totalTestsViral           1544 non-null float64
positiveTestsViral        515 non-null float64
negativeTestsViral        514 non-null float64
positiveCasesViral        3010 non-null float64
commercialScore           5830 non-null int64
negativeRegularScore      5830 non-null int64
negativeScore             5830 non-null int64
positiveScore             5830 non-null int64
score                     5830 non-null int64
grade                     0 non-null float64
bedsPerThousand           5830 non-null float64
total_beds                5830 non-null float64
dtypes: datetime64[ns](1), float64(16), int64(6), object(6)
memory usage: 1.3+ MB

Conclusion: onVentilatorCurrently, inIcuCurrently, hospitalizedCurrently and hospitalizedCumulative contain many NaN values. We further inspect the NaN values by printing the head of the dataset.

# preview the reloaded dataset
covid_df.head()
date state abbrev population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
0 2020-06-26 Alaska AK 734002 836.000 303.000 12.000 nan nan 2.000 ... nan nan 0 0 0 0 0 nan 2.200 1614.804
1 2020-06-26 Alabama AL 4908621 34183.000 14410.000 658.000 2653.000 nan nan ... nan 33717.000 0 0 0 0 0 nan 3.100 15216.725
2 2020-06-26 Arkansas AR 3038999 18062.000 5695.000 284.000 1300.000 nan 61.000 ... nan 18740.000 0 0 0 0 0 nan 3.200 9724.797
3 2020-06-26 Arizona AZ 7378494 66548.000 56624.000 2110.000 4514.000 581.000 312.000 ... nan 66055.000 0 0 0 0 0 nan 1.900 14019.139
4 2020-06-26 California CA 39937489 200461.000 194649.000 5639.000 nan 1570.000 nan ... nan 200461.000 0 0 0 0 0 nan 1.800 71887.480

5 rows × 29 columns

The NaN values may indicate that there were few or no Covid-19 patients at these dates. We further analyse the statistical values of the dataset columns.

# summary statistics of every numeric column
covid_df.describe()
# TODO round up the numbers
population positive active hospitalizedCurrently hospitalizedCumulative inIcuCurrently onVentilatorCurrently recovered death hospitalized ... negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade bedsPerThousand total_beds
count 5830.000 5830.000 5830.000 3551.000 3162.000 1833.000 1630.000 5830.000 5830.000 3162.000 ... 514.000 3010.000 5830.000 5830.000 5830.000 5830.000 5830.000 0.000 5830.000 5830.000
mean 6543778.006 20671.992 18487.479 1032.840 4317.587 447.338 228.415 4318.329 1080.164 4317.587 ... 283412.642 31732.312 0.000 0.000 0.000 0.000 0.000 nan 2.625 15807.419
std 7387219.213 46209.844 41691.369 1942.496 12899.042 698.865 332.295 10788.010 2889.998 12899.042 ... 376950.308 56252.909 0.000 0.000 0.000 0.000 0.000 nan 0.744 16159.931
min 567025.000 0.000 0.000 1.000 0.000 2.000 0.000 0.000 0.000 0.000 ... 17.000 0.000 0.000 0.000 0.000 0.000 0.000 nan 1.600 1318.928
25% 1778070.000 624.250 537.250 121.000 218.250 83.000 36.000 0.000 12.000 218.250 ... 48368.750 4965.250 0.000 0.000 0.000 0.000 0.000 nan 2.100 3773.952
50% 4499692.000 4920.000 4439.000 402.000 946.000 186.000 94.000 189.000 140.000 946.000 ... 134948.500 13536.500 0.000 0.000 0.000 0.000 0.000 nan 2.500 11557.920
75% 7797095.000 20155.750 17265.250 1044.000 3159.500 492.000 252.500 2958.500 759.500 3159.500 ... 322553.250 34848.000 0.000 0.000 0.000 0.000 0.000 nan 3.100 19124.737
max 39937489.000 391220.000 356899.000 18825.000 89995.000 5225.000 2425.000 76282.000 24814.000 89995.000 ... 1943604.000 391220.000 0.000 0.000 0.000 0.000 0.000 nan 4.800 71887.480

8 rows × 22 columns

# drop unnecessary columns
covid_cleaned = covid_df.drop(['hospitalized', 'bedsPerThousand'], axis=1)
covid_100k = covid_cleaned.copy()
# list of columns to transform to per 100k
columns_list = ['positive', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']
# add per-100k columns; 'total_beds' gets the special name 'BedsPer100k'.
# (The previous version called .format() on the literal 'BedsPer100k',
# which had no placeholder and was a no-op.)
for column in columns_list:
    new_name = 'BedsPer100k' if column == 'total_beds' else '{}_100k'.format(column)
    covid_100k[new_name] = (covid_cleaned.loc[:, column] / covid_cleaned.loc[:, 'population']) * 100000

# keep only the per-100k variants
covid_100k = covid_100k.drop(columns_list, axis=1)
covid_100k['date'] = pd.to_datetime(covid_100k['date'])
# restrict to roughly the last month of data
start_date = '2020-04-18'
end_date = '2020-05-19'
mask = (covid_100k['date'] > start_date) & (covid_100k['date'] <= end_date)
covid_100k_last_month = covid_100k.loc[mask]
# cumulative counters -> daily increments via diff(); currently-occupied
# counters and bed capacity stay as levels
covid_100k_last_month_part1 =  covid_100k_last_month.groupby('date').sum().loc[:, ['positive_100k','active_100k','recovered_100k','death_100k','hospitalizedCumulative_100k']].diff(periods=1, axis=0)

covid_100k_last_month_part2 = covid_100k_last_month.groupby('date').sum().loc[:, ['inIcuCurrently_100k','onVentilatorCurrently_100k','BedsPer100k']]

final_100k_last_month = covid_100k_last_month_part1.merge(covid_100k_last_month_part2, left_index=True, right_index=True)
final_100k_last_month.head()
positive_100k active_100k recovered_100k death_100k hospitalizedCumulative_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
date
2020-04-19 nan nan nan nan nan 153.528 80.717 13440.000
2020-04-20 413.759 391.692 35.481 25.728 22.652 156.581 79.710 13440.000
2020-04-21 387.394 360.446 65.218 30.520 31.446 166.081 78.603 13440.000
2020-04-22 428.601 989.954 412.625 28.780 36.181 167.561 78.032 13440.000
2020-04-23 452.031 -2213.482 72.921 26.282 28.842 166.277 94.521 13440.000
# summary statistics of the daily per-100k increments and levels
final_100k_last_month.describe()
positive_100k active_100k recovered_100k death_100k hospitalizedCumulative_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
count 30.000 30.000 30.000 30.000 30.000 31.000 31.000 31.000
mean 399.188 364.943 147.172 23.271 39.160 134.117 73.503 13440.000
std 58.939 634.169 81.341 5.781 43.524 19.860 8.141 0.000
min 287.019 -2213.482 35.481 13.315 9.507 109.602 61.622 13440.000
25% 348.980 314.204 80.563 18.439 22.991 118.222 66.261 13440.000
50% 405.026 366.234 127.774 24.119 28.295 127.613 74.706 13440.000
75% 432.647 419.664 212.491 26.201 32.754 149.768 79.157 13440.000
max 544.349 2291.210 412.625 33.917 246.371 167.561 94.521 13440.000
# Persist the summary statistics of the cleaned dataset
describe_file = 'results/final_100k_last_month.csv'
summary = final_100k_last_month.describe()
summary.to_csv(describe_file)

B. Graphical Exploratory Analysis

Plotting histograms, scatterplots and boxplots to assess the distribution of the dataset.

# Select the numeric columns of interest (omitting categorical state and
# time fields) and build a per-date national time series.
covid_cleaned['date'] = pd.to_datetime(covid_cleaned['date'])

# mask data for last month
start_date = '2020-04-18'
end_date = '2020-05-19'
dates = covid_cleaned['date']
mask = (dates > start_date) & (dates <= end_date)
covid_cleaned_last_month = covid_cleaned.loc[mask]

# absolute counts vs per-100k counts, for plotting
plot_df = covid_cleaned_last_month[['population', 'active', 'recovered', 'death', 'hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'total_beds']]
plot_df_last_month = covid_100k_last_month[['population', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']]

# nation-wide totals per day (summed over all states)
ts_columns = ['date', 'positive_100k', 'active_100k', 'recovered_100k', 'death_100k', 'hospitalizedCurrently_100k', 'inIcuCurrently_100k', 'onVentilatorCurrently_100k', 'BedsPer100k']
timeseries_usa_df = covid_100k[ts_columns].groupby('date').sum().reset_index()
timeseries_usa_df.tail()
date positive_100k active_100k recovered_100k death_100k hospitalizedCurrently_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
152 2020-06-22 32457.545 18905.435 12018.320 1533.791 385.922 67.908 36.848 13440.000
153 2020-06-23 32860.514 19133.883 12183.722 1542.909 403.069 71.078 37.684 13440.000
154 2020-06-24 33315.285 19401.954 12359.391 1553.940 408.570 68.612 36.820 13440.000
155 2020-06-25 33812.912 19730.969 12498.864 1583.079 414.087 67.864 36.962 13440.000
156 2020-06-26 34335.924 20098.997 12643.998 1592.929 404.115 67.051 34.318 13440.000
# One trace per national per-100k metric
x_dates = timeseries_usa_df['date'].tolist()
y_hospitalized = timeseries_usa_df['hospitalizedCurrently_100k'].tolist()
y_icu = timeseries_usa_df['inIcuCurrently_100k'].tolist()
y_pos = timeseries_usa_df['positive_100k'].tolist()
y_act = timeseries_usa_df['active_100k'].tolist()
y_rec = timeseries_usa_df['recovered_100k'].tolist()
y_death = timeseries_usa_df['death_100k'].tolist()
y_vent = timeseries_usa_df['onVentilatorCurrently_100k'].tolist()
y_beds = timeseries_usa_df['BedsPer100k'].tolist()

hospitalized_trace = go.Scatter(x=x_dates, y=y_hospitalized, name='Hospitalized')
icu_trace = go.Scatter(x=x_dates, y=y_icu, name='ICU')
pos_trace = go.Scatter(x=x_dates, y=y_pos, name='Positive')
act_trace = go.Scatter(x=x_dates, y=y_act, name='Active')
rec_trace = go.Scatter(x=x_dates, y=y_rec, name='Recovered')
death_trace = go.Scatter(x=x_dates, y=y_death, name='Death')
vent_trace = go.Scatter(x=x_dates, y=y_vent, name='On Ventilator')
beds_trace = go.Scatter(x=x_dates, y=y_beds, name='Beds')

# Shared layout for both figures
layout = go.Layout(xaxis_title='Date', yaxis_title='Count/100k', title_x=0.5, xaxis_tickformat = '%d-%m-%Y')
# (A make_subplots(rows=1, cols=2) figure was created here but immediately
# overwritten by go.Figure below; that dead statement has been removed.)
# Hospital-load metrics
fig = go.Figure([hospitalized_trace, icu_trace, vent_trace], layout=layout)
fig.show()
# Case-count metrics plus bed capacity
fig = go.Figure([pos_trace, act_trace, rec_trace, death_trace, beds_trace], layout=layout)
fig.show()
# bar plot: 10 states with the largest cumulative hospitalizations per 100k
# at the most recent reporting date
recent_date = covid_100k['date'].max()
most_recent = covid_100k[covid_100k.date == recent_date]
largest_hospitalized = most_recent.nlargest(10, 'hospitalizedCumulative_100k')
rename_map = {'state': 'State', 'positive_100k': 'positive', 'hospitalizedCumulative_100k': 'hospitalized'}
largest_hospitalized_filter = largest_hospitalized[['state', 'positive_100k', 'hospitalizedCumulative_100k']].rename(columns=rename_map)
largest_hospitalized_filter = largest_hospitalized_filter.sort_values(by=['hospitalized']).set_index('State')
largest_hospitalized_filter.plot.barh(figsize=(8,5), color=['green', 'purple'])
<matplotlib.axes._subplots.AxesSubplot at 0x20b6aa50108>
# display the full per-100k dataframe
covid_100k
date state abbrev population dataQualityGrade lastUpdateEt dateModified checkTimeEt totalTestsViral positiveTestsViral ... grade positive_100k active_100k recovered_100k death_100k hospitalizedCurrently_100k hospitalizedCumulative_100k inIcuCurrently_100k onVentilatorCurrently_100k BedsPer100k
0 2020-06-26 Alaska AK 734002 A 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 101792.000 nan ... nan 113.896 41.281 70.708 1.907 1.635 nan nan 0.272 220.000
1 2020-06-26 Alabama AL 4908621 B 6/26/2020 11:00 2020-06-26T11:00:00Z 06/26 07:00 nan nan ... nan 696.387 293.565 384.344 18.478 13.405 54.048 nan nan 310.000
2 2020-06-26 Arkansas AR 3038999 A 6/26/2020 14:40 2020-06-26T14:40:00Z 06/26 10:40 nan nan ... nan 594.340 187.397 399.046 7.897 9.345 42.777 nan 2.007 320.000
3 2020-06-26 Arizona AZ 7378494 A+ 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 479330.000 nan ... nan 901.918 767.419 113.695 20.804 28.597 61.178 7.874 4.229 190.000
4 2020-06-26 California CA 39937489 B 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 3771314.000 nan ... nan 501.937 487.384 0.000 14.553 14.120 nan 3.931 nan 180.000
5 2020-06-26 Colorado CO 5845526 A 6/25/2020 00:00 2020-06-25T00:00:00Z 06/24 20:00 nan nan ... nan 538.514 434.486 75.477 28.552 4.225 92.139 nan nan 190.000
6 2020-06-26 Connecticut CT 3563077 B 6/25/2020 20:30 2020-06-25T20:30:00Z 06/25 16:30 414889.000 nan ... nan 1292.675 945.784 226.013 120.879 3.564 288.178 nan nan 200.000
7 2020-06-26 District of Columbia DC 720687 A+ 6/25/2020 00:00 2020-06-25T00:00:00Z 06/24 20:00 nan nan ... nan 1413.235 1172.909 164.565 75.761 20.397 nan 6.660 4.024 440.000
8 2020-06-26 Delaware DE 982895 A 6/25/2020 18:00 2020-06-25T18:00:00Z 06/25 14:00 nan nan ... nan 1120.873 391.598 677.692 51.582 7.936 nan nan nan 220.000
9 2020-06-26 Florida FL 21992985 A 6/25/2020 23:59 2020-06-25T23:59:00Z 06/25 19:59 2107048.000 159793.000 ... nan 559.087 543.337 0.000 15.750 nan 64.934 nan nan 260.000
10 2020-06-26 Georgia GA 10736059 A 6/26/2020 14:50 2020-06-26T14:50:00Z 06/26 10:50 771028.000 66510.000 ... nan 679.905 654.104 0.000 25.801 11.028 98.779 nan nan 240.000
11 2020-06-26 Hawaii HI 1412687 D 6/25/2020 18:00 2020-06-25T18:00:00Z 06/25 14:00 84910.000 850.000 ... nan 60.169 9.698 49.268 1.203 nan 7.716 nan nan 190.000
12 2020-06-26 Iowa IA 3179849 A+ 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 nan nan ... nan 870.670 304.071 544.460 22.139 4.434 nan 1.321 0.755 300.000
13 2020-06-26 Idaho ID 1826156 A 6/25/2020 19:00 2020-06-25T19:00:00Z 06/25 15:00 81871.000 nan ... nan 266.407 58.210 203.269 4.928 nan 16.592 nan nan 190.000
14 2020-06-26 Illinois IL 12659682 A 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 1490952.000 nan ... nan 1116.489 1060.817 0.000 55.673 11.975 nan 3.160 1.777 250.000
15 2020-06-26 Indiana IN 6745354 A+ 6/25/2020 23:59 2020-06-25T23:59:00Z 06/25 19:59 nan nan ... nan 654.376 120.809 495.096 38.471 10.259 102.945 4.166 1.394 270.000
16 2020-06-26 Kansas KS 2910357 A 6/26/2020 10:00 2020-06-26T10:00:00Z 06/26 06:00 nan nan ... nan 465.166 429.329 26.766 9.071 nan 38.758 nan nan 330.000
17 2020-06-26 Kentucky KY 4499692 B 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 nan nan ... nan 330.223 235.038 82.895 12.290 8.601 57.537 1.645 nan 320.000
18 2020-06-26 Louisiana LA 4645184 B 6/26/2020 13:00 2020-06-26T13:00:00Z 06/26 09:00 nan nan ... nan 1179.049 253.747 856.629 68.673 15.069 nan nan 1.572 330.000
19 2020-06-26 Massachusetts MA 6976597 A+ 6/26/2020 10:00 2020-06-26T10:00:00Z 06/26 06:00 1020867.000 nan ... nan 1549.036 1434.181 0.000 114.855 11.338 161.841 2.236 1.419 230.000
20 2020-06-26 Maryland MD 6083116 A 6/26/2020 10:00 2020-06-26T10:00:00Z 06/26 06:00 613513.000 nan ... nan 1086.861 954.609 80.600 51.651 8.006 176.308 3.123 nan 190.000
21 2020-06-26 Maine ME 1345790 A 6/25/2020 23:59 2020-06-25T23:59:00Z 06/25 19:59 89510.000 3787.000 ... nan 230.497 33.958 188.885 7.653 2.081 25.487 0.669 0.446 250.000
22 2020-06-26 Michigan MI 10045029 A+ 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 998490.000 86113.000 ... nan 690.182 138.427 490.690 61.065 5.545 nan 1.921 1.055 250.000
23 2020-06-26 Minnesota MN 5700671 A 6/25/2020 17:00 2020-06-25T17:00:00Z 06/25 13:00 557278.000 nan ... nan 607.227 55.467 526.394 25.365 5.877 69.571 2.754 nan 250.000
24 2020-06-26 Missouri MO 6169270 B 6/26/2020 15:00 2020-06-26T15:00:00Z 06/26 11:00 424214.000 23527.000 ... nan 322.793 306.746 0.000 16.047 9.726 nan nan 1.070 310.000
25 2020-06-26 Mississippi MS 2989260 A 6/25/2020 19:00 2020-06-25T19:00:00Z 06/25 15:00 271734.000 nan ... nan 838.535 227.548 576.798 34.189 25.725 101.831 5.118 3.011 400.000
26 2020-06-26 Montana MT 1086759 C 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 nan nan ... nan 76.282 20.060 54.198 2.024 1.288 8.742 nan nan 330.000
27 2020-06-26 North Carolina NC 10611862 A 6/26/2020 11:55 2020-06-26T11:55:00Z 06/26 07:55 nan nan ... nan 554.267 194.066 347.922 12.279 8.406 nan nan nan 210.000
28 2020-06-26 North Dakota ND 761723 D 6/26/2020 00:00 2020-06-26T00:00:00Z 06/25 20:00 169838.000 nan ... nan 449.113 32.033 405.659 11.421 3.019 29.144 nan nan 430.000
29 2020-06-26 Nebraska NE 1952570 B 6/25/2020 19:55 2020-06-25T19:55:00Z 06/25 15:55 nan nan ... nan 939.582 295.610 630.656 13.316 6.453 66.272 nan nan 360.000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5800 2020-02-20 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 1.052 1.052 0.000 0.000 nan nan nan nan 170.000
5801 2020-02-19 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.885 0.885 0.000 0.000 nan nan nan nan 170.000
5802 2020-02-18 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.757 0.757 0.000 0.000 nan nan nan nan 170.000
5803 2020-02-17 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.641 0.641 0.000 0.000 nan nan nan nan 170.000
5804 2020-02-16 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.449 0.449 0.000 0.000 nan nan nan nan 170.000
5805 2020-02-15 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.359 0.359 0.000 0.000 nan nan nan nan 170.000
5806 2020-02-14 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.269 0.269 0.000 0.000 nan nan nan nan 170.000
5807 2020-02-13 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.231 0.231 0.000 0.000 nan nan nan nan 170.000
5808 2020-02-12 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.231 0.231 0.000 0.000 nan nan nan nan 170.000
5809 2020-02-11 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.218 0.218 0.000 0.000 nan nan nan nan 170.000
5810 2020-02-10 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.205 0.205 0.000 0.000 nan nan nan nan 170.000
5811 2020-02-09 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.167 0.167 0.000 0.000 nan nan nan nan 170.000
5812 2020-02-08 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.167 0.167 0.000 0.000 nan nan nan nan 170.000
5813 2020-02-07 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.154 0.154 0.000 0.000 nan nan nan nan 170.000
5814 2020-02-06 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.141 0.141 0.000 0.000 nan nan nan nan 170.000
5815 2020-02-05 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.103 0.103 0.000 0.000 nan nan nan nan 170.000
5816 2020-02-04 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.103 0.103 0.000 0.000 nan nan nan nan 170.000
5817 2020-02-03 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.090 0.090 0.000 0.000 nan nan nan nan 170.000
5818 2020-02-02 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.077 0.077 0.000 0.000 nan nan nan nan 170.000
5819 2020-02-01 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.051 0.051 0.000 0.000 nan nan nan nan 170.000
5820 2020-01-31 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.038 0.038 0.000 0.000 nan nan nan nan 170.000
5821 2020-01-30 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.038 0.038 0.000 0.000 nan nan nan nan 170.000
5822 2020-01-29 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.038 0.038 0.000 0.000 nan nan nan nan 170.000
5823 2020-01-28 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5824 2020-01-27 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5825 2020-01-26 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5826 2020-01-25 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5827 2020-01-24 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5828 2020-01-23 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000
5829 2020-01-22 Washington WA 7797095 NaN NaN NaN NaN nan nan ... nan 0.026 0.026 0.000 0.000 nan nan nan nan 170.000

5830 rows × 27 columns

# Timeseries plot of patients currently hospitalized across the USA.
fig, ax = plt.subplots(figsize = (8, 5))
ax.plot(covid_df.date, covid_df.hospitalizedCurrently)
# BUG FIX: the title previously said "Currently in ICU", but the series
# plotted is hospitalizedCurrently, not inIcuCurrently.
ax.set_title('Number of USA Patients Currently Hospitalized')
# TODO change the X-axis to chip the year
# TODO which timeseries plots do we need?
# TODO how to temporarily remove the outliers?
Text(0.5, 1.0, 'Number of USA Patients Currently in ICU')
# get data from last day
# plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18'] 

# Plot histograms to inspect distribution shape, skewness and scale,
# with a fitted normal curve overlaid on each.
# NOTE(review): plot_df_last_month is not defined anywhere visible in this
# file — presumably created in a removed cell; confirm before rerunning.
fig, axs = plt.subplots(4,2,figsize = (10, 10))
sns.set()
for i, column in enumerate(plot_df_last_month.columns):
    # Fill the 4x2 grid row by row: row = i // 2, column alternates 0, 1.
    ax = axs[i // 2, i % 2]
    sns.distplot(plot_df_last_month[column], fit=norm, fit_kws=dict(label='normality'), hist_kws=dict(color='plum', edgecolor='k', linewidth=1, label='frequency'), ax=ax, color='#9d53ad')
    ax.legend(loc='upper right')
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Looking at linearity and variance with scatterplots
# Split the target variable from the independent variables.
target = plot_df.hospitalizedCurrently
indep_var = plot_df.drop(columns=['hospitalizedCurrently'])

# BUG FIX: plt.subplots() created a full-size axes that lingered empty behind
# the grid added via fig.add_subplot; start from a bare figure instead.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(indep_var.columns):
    ax = fig.add_subplot(4, 3, i+1)
    sns.regplot(x=indep_var[col], y=target, data=indep_var, label=col, scatter_kws={'s':10}, line_kws={"color": "plum", 'label': 'hospitCurr'})
    plt.legend()
# The figure-level title only needs to be set once, not on every iteration.
plt.suptitle('Scatterplots with Target HospitalizedCurrently', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Assessing the normality of the distribution with a boxplot
# Boxplot with removed outliers (showfliers=False)
# BUG FIX: use a bare figure — plt.subplots() also created a stray full-size
# axes that stayed empty behind the subplot grid.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(plot_df.columns):
    ax = fig.add_subplot(4, 3, i+1)
    sns.boxplot(x=plot_df[col], data=plot_df, color='lightblue', showfliers=False)
# Set the figure-level title once, outside the loop.
plt.suptitle('Boxplots of Independent Variables', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# get data from last day
# NOTE(review): this masks plot_df with covid_df's date column — assumes the
# two frames are row-aligned on the same index; confirm.
plot_df_last_date = plot_df.loc[covid_df['date'] == '2020-05-18'] 

# BUG FIX: bare figure instead of plt.subplots(), which left an extra empty
# full-size axes behind the grid of boxplots.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(plot_df_last_date.columns):
    ax = fig.add_subplot(4, 3, i+1)
    sns.boxplot(x=plot_df_last_date[col], data=plot_df, color='lightblue', showfliers=True)
# Title set once, outside the loop.
plt.suptitle('Boxplots of Independent Variables', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

New York and California Analysis

Since the normality of the independent variables is highly questionable, we further assess the distributions by comparing two states (NY and CA) with each other.

We start with constructing scatter plots and boxplots of New York.

# Split covid_df into subset with only NY values
new_york = covid_df.loc[covid_df['abbrev'] == 'NY'] 

# Timeseries plt
plt.plot(new_york.date, new_york.hospitalizedCurrently)
plt.title('Number of Patients in NY Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')

# TODO the code below can be deleted once we fix the date format in plots
# BUG FIX: the locator/formatter were being applied to a stale `ax` left over
# from an earlier figure; grab the current axes of the plot made just above.
ax = plt.gca()
#set ticks every week
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
#set major ticks format
ax.xaxis.set_major_formatter(mdates.DateFormatter('%m'))
# Keep only the numeric case/outcome columns (drop the categorical and date cols).
numeric_cols = ['positive', 'active', 'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']
new_york = new_york[numeric_cols]
# Scatter plots NY
# Separate the dependent variable from the predictors.
target_ny = new_york.hospitalizedCurrently
indep_var_ny = new_york.drop(columns=['hospitalizedCurrently'])

# BUG FIX: bare figure — plt.subplots() created a stray full-size axes that
# stayed empty behind the 3x3 grid of regression plots.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(indep_var_ny.columns):
    ax = fig.add_subplot(3, 3, i+1)
    sns.regplot(x=indep_var_ny[col], y=target_ny, data=indep_var_ny, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
# Title set once, outside the loop.
plt.suptitle('Distributions of Independent Variables NY', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of NY
# BUG FIX: bare figure — plt.subplots() left an unused full-size axes behind
# the 3x3 grid.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(new_york.columns):
    ax = fig.add_subplot(3, 3, i+1)
    sns.boxplot(x=new_york[col], data=new_york, color='lightpink', showfliers=True)
# Title set once, outside the loop.
plt.suptitle('Boxplots of Independent Variables NY', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

Moving onto California:

# Subset to California rows (filters on both the abbreviation and the full
# state name, matching the original belt-and-braces check).
ca_mask = (covid_df['abbrev'] == 'CA') & (covid_df['state']== 'California')
cali = covid_df.loc[ca_mask]
# TODO fix legend/axis/plot altogether
# Timeseries plt
plt.plot(cali.date, cali.hospitalizedCurrently)
plt.title('Number of Patients in CA Currently Hospitalized')
plt.xlabel('Date')
plt.ylabel('No. Patients')
Text(0, 0.5, 'No. Patients')
# Checking which cols have NaN values
# (bare expressions below only render when run as the last line of a notebook cell)
cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death', 'hospitalized']]
cali.head()

# Omit the NaN cols — 'hospitalized' is dropped relative to the check above,
# presumably because it was mostly NaN for CA; TODO confirm from the output.
cali = cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'recovered', 'death']]
# Scatter plots CA
# Split dependent var from independent variables
target_ca = cali.hospitalizedCurrently
indep_var_ca = cali.drop(columns=['hospitalizedCurrently'])

# BUG FIX: bare figure — plt.subplots() created a stray full-size axes behind
# the 2x3 grid of regression plots.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(indep_var_ca.columns):
    ax = fig.add_subplot(2, 3, i+1)
    sns.regplot(x=indep_var_ca[col], y=target_ca, data=indep_var_ca, label=col, scatter_kws={'s':10}, line_kws={"color": "plum"})
# Title set once, outside the loop.
plt.suptitle('Distributions of Independent Variables CA', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)
# Boxplot of CA ('recovered' is additionally dropped before plotting)
cali = cali[['positive', 'active', 'hospitalizedCurrently', 'inIcuCurrently', 'death']]
# BUG FIX: bare figure — plt.subplots() left an unused full-size axes behind
# the 3x2 grid.
fig = plt.figure(figsize = (10, 8))
for i, col in enumerate(cali.columns):
    ax = fig.add_subplot(3, 2, i+1)
    sns.boxplot(x=cali[col], data=cali, color='lightgrey', showfliers=True)
# Title set once, outside the loop.
plt.suptitle('Boxplots CA', fontsize=18)
plt.tight_layout()
fig.subplots_adjust(top=0.95)

Assessing Correlation of Independent Variables

# TODO add some explanation / look more into collinear variables
# Heatmap of correlations
# Save correlations to variable
corr = covid_cleaned.corr(method='pearson')
# Mask the upper triangle so duplicate (mirrored) correlations are hidden.
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 —
# use the builtin bool as the dtype instead.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(8,8))

# Generate heatmap
sns.heatmap(corr, annot=True, mask=mask, cmap='GnBu', center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
<matplotlib.axes._subplots.AxesSubplot at 0x20b6d151788>

Step 3: Build a Model for the Dependent Variable

  • To be used to predict hospitalizedCurrently
  • Having more complete variables for inICUCurrently and onVentilatorCurrently will allow us to predict these numbers as well
# We compare three models:
# - Polynomial Regression
# - Linear Regression
# - ElasticNet

# Copy DFs to not mess up original one
# We will use model_df for our regression model
model_df = all_cases.copy()

# Delete redundant COLUMNS (the original comment said "rows"). drop() takes
# the whole list in one call and, like del, raises KeyError if one is missing.
model_df = model_df.drop(columns=['abbrev', 'bedsPerThousand', 'hospitalized',
                                  'state', 'hospitalizedCumulative',
                                  'dataQualityGrade', 'lastUpdateEt'])

# Drop NaN values for hospitalizedCurrently
model_df = model_df.dropna(subset=['hospitalizedCurrently'])

# Drop rows with abnormal hospitalized/active ratios (at or above the 99th
# percentile). NaN ratios compare False and are therefore kept, as before.
model_df['ratio_hospital'] = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(model_df['ratio_hospital'] >= model_df.ratio_hospital.quantile(0.99))]

#model_df = model_df[~(model_df['ratio_hospital'] <= model_df['ratio_hospital'].median())]
del model_df['ratio_hospital']

# Get peek of model to use
model_df.describe()
population positive active hospitalizedCurrently inIcuCurrently onVentilatorCurrently recovered death totalTestsViral positiveTestsViral negativeTestsViral positiveCasesViral commercialScore negativeRegularScore negativeScore positiveScore score grade total_beds
count 3515.000 3515.000 3515.000 3515.000 1784.000 1583.000 3515.000 3515.000 1079.000 383.000 382.000 2441.000 3515.000 3515.000 3515.000 3515.000 3515.000 0.000 3515.000
mean 6742215.381 31355.048 27858.963 1029.290 444.145 225.527 6809.578 1696.697 388488.307 24974.708 235269.796 36143.066 0.000 0.000 0.000 0.000 0.000 nan 16019.574
std 7749524.762 56347.216 50875.985 1946.232 704.719 334.798 13206.737 3575.315 523078.795 25922.592 225340.603 60974.372 0.000 0.000 0.000 0.000 0.000 nan 16533.007
min 567025.000 115.000 113.000 1.000 2.000 0.000 0.000 0.000 9055.000 407.000 8648.000 396.000 0.000 0.000 0.000 0.000 0.000 nan 1318.928
25% 1778070.000 3203.000 2818.000 117.500 81.000 34.500 0.000 88.000 85635.000 4128.000 62359.250 6344.000 0.000 0.000 0.000 0.000 0.000 nan 3773.952
50% 4645184.000 11948.000 10014.000 402.000 181.000 92.000 1252.000 466.000 213753.000 13853.000 167221.500 16083.000 0.000 0.000 0.000 0.000 0.000 nan 11557.920
75% 8626207.000 34570.000 29929.500 1033.500 476.750 244.000 6102.000 1579.000 472451.000 43427.000 286083.250 40022.000 0.000 0.000 0.000 0.000 0.000 nan 19124.737
max 39937489.000 391220.000 356899.000 18825.000 5225.000 2425.000 76282.000 24814.000 3771314.000 86113.000 912377.000 391220.000 0.000 0.000 0.000 0.000 0.000 nan 71887.480